import pandas as pd
import numpy as np
import matplotlib as mtp
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from warnings import filterwarnings
filterwarnings('ignore')
data1 = pd.read_csv('D:/Project 2024/Project Unifield M/Project 1_Analyzing Amazon Sales/Amazon Sales analysis.csv')
data1.head(3)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Month-Year | Year | Order ID | Ship date | Month-Year.1 | Year.1 | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Australia and Oceania | Tuvalu | Baby Food | Offline | H | 28-05-2010 | May-10 | 2010 | 669165933 | 27-06-2010 | Jun-10 | 2010 | 9925 | 255 | 159 | 2533654 | 1582244 | 951411 |
| 1 | Central America and the Caribbean | Grenada | Cereal | Online | C | 22-08-2012 | Aug-12 | 2012 | 963881480 | 15-09-2012 | Sep-12 | 2012 | 2804 | 206 | 117 | 576783 | 328376 | 248406 |
| 2 | Europe | Russia | Office Supplies | Offline | L | 02-05-2014 | May-14 | 2014 | 341417157 | 08-05-2014 | May-14 | 2014 | 1779 | 651 | 525 | 1158503 | 933904 | 224599 |
#---------------------Order date - ship date day in ship date--------------
data1.tail(3)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Month-Year | Year | Order ID | Ship date | Month-Year.1 | Year.1 | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 97 | Sub-Saharan Africa | Sierra Leone | Vegetables | Offline | C | 01-06-2016 | Jun-16 | 2016 | 728815257 | 29-06-2016 | Jun-16 | 2016 | 1485 | 154 | 91 | 228779 | 135031 | 93748 |
| 98 | North America | Mexico | Personal Care | Offline | M | 30-07-2015 | Jul-15 | 2015 | 559427106 | 08-08-2015 | Aug-15 | 2015 | 5767 | 82 | 57 | 471337 | 326816 | 144521 |
| 99 | Sub-Saharan Africa | Mozambique | Household | Offline | L | 10-02-2012 | Feb-12 | 2012 | 665095412 | 15-02-2012 | Feb-12 | 2012 | 5367 | 668 | 503 | 3586605 | 2697132 | 889473 |
data1.shape
(100, 18)
print(f'Total Of Rows in dataset is-{100}' )
print(f'Total Of Column in dataset is-{18}' )
Total Of Rows in dataset is-100 Total Of Column in dataset is-18
data1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 100 non-null object 1 Country 100 non-null object 2 Item Type 100 non-null object 3 Sales Channel 100 non-null object 4 Order Priority 100 non-null object 5 Order Date 100 non-null object 6 Month-Year 100 non-null object 7 Year 100 non-null int64 8 Order ID 100 non-null int64 9 Ship date 100 non-null object 10 Month-Year.1 100 non-null object 11 Year.1 100 non-null int64 12 Units Sold 100 non-null int64 13 Unit Price 100 non-null int64 14 Unit Cost 100 non-null int64 15 Total Revenue 100 non-null int64 16 Total Cost 100 non-null int64 17 Total Profit 100 non-null int64 dtypes: int64(9), object(9) memory usage: 14.2+ KB
data1['Order Date']=pd.to_datetime(data1['Order Date'])
data1['Ship date']=pd.to_datetime(data1['Ship date'])
data1.isnull().sum()
Region 0 Country 0 Item Type 0 Sales Channel 0 Order Priority 0 Order Date 0 Month-Year 0 Year 0 Order ID 0 Ship date 0 Month-Year.1 0 Year.1 0 Units Sold 0 Unit Price 0 Unit Cost 0 Total Revenue 0 Total Cost 0 Total Profit 0 dtype: int64
data1[['Total Revenue','Units Sold','Total Profit']].describe()
| Total Revenue | Total Profit | Units Sold | |
|---|---|---|---|
| count | 1.000000e+02 | 1.000000e+02 | 100.000000 |
| mean | 1.373488e+06 | 4.416820e+05 | 5128.710000 |
| std | 1.460029e+06 | 4.385379e+05 | 2794.484562 |
| min | 4.870000e+03 | 1.258000e+03 | 124.000000 |
| 25% | 2.687210e+05 | 1.214438e+05 | 2836.250000 |
| 50% | 7.523145e+05 | 2.907680e+05 | 5382.500000 |
| 75% | 2.212045e+06 | 6.358290e+05 | 7369.000000 |
| max | 5.997055e+06 | 1.719922e+06 | 9925.000000 |
Revenue=data1['Total Revenue'].sum()
print('Total Revenue=',Revenue)
Total Revenue= 137348766
Profit=data1['Total Profit'].sum()
print('Total Profit=',Profit)
Total Profit= 44168199
Cost=data1['Total Cost'].sum()
print('Total Cost=',Cost)
Total Cost= 93180569
data1.groupby(by=['Region'])[['Total Revenue','Total Profit']].sum().sort_values(by='Total Profit',ascending=False)
| Total Revenue | Total Profit | |
|---|---|---|
| Region | ||
| Sub-Saharan Africa | 39672031 | 12183213 |
| Europe | 33368930 | 11082938 |
| Asia | 21347091 | 6113846 |
| Middle East and North Africa | 14052706 | 5761191 |
| Australia and Oceania | 14094265 | 4722161 |
| Central America and the Caribbean | 9170386 | 2846908 |
| North America | 5643357 | 1457942 |
data1.groupby(by=['Country'])[['Units Sold','Total Revenue','Total Profit']].sum().head(10).sort_values(by='Total Profit',ascending=False)
| Units Sold | Total Revenue | Total Profit | |
|---|---|---|---|
| Country | |||
| Azerbaijan | 9255 | 4478800 | 1512927 |
| Brunei | 6708 | 4368317 | 846885 |
| Angola | 4187 | 2798046 | 693912 |
| Bulgaria | 5660 | 2779199 | 626224 |
| Bangladesh | 8263 | 902981 | 606835 |
| Australia | 12995 | 2489933 | 576605 |
| Burkina Faso | 8082 | 1245113 | 510217 |
| Austria | 2847 | 1244708 | 495008 |
| Belize | 5498 | 600821 | 403773 |
| Albania | 2269 | 247956 | 166635 |
data1.groupby(by=['Item Type'])[['Units Sold','Total Revenue','Total Profit']].sum().sort_values(by='Total Profit',ascending=False)
| Units Sold | Total Revenue | Total Profit | |
|---|---|---|---|
| Item Type | |||
| Cosmetics | 83718 | 36601508 | 14556048 |
| Household | 44727 | 29889713 | 7412606 |
| Office Supplies | 46967 | 30585380 | 5929586 |
| Clothes | 71260 | 7787292 | 5233333 |
| Baby Food | 40545 | 10350328 | 3886645 |
| Cereal | 25877 | 5322898 | 2292443 |
| Vegetables | 20051 | 3089056 | 1265819 |
| Personal Care | 48708 | 3980905 | 1220622 |
| Beverages | 56708 | 2690795 | 888048 |
| Snacks | 13637 | 2080734 | 751945 |
| Meat | 10675 | 4503676 | 610610 |
| Fruits | 49998 | 466481 | 120494 |
Count=data1['Item Type'].value_counts()
print('Count of Items Types\n',Count)
Count of Items Types Item Type Clothes 13 Cosmetics 13 Office Supplies 12 Fruits 10 Personal Care 10 Household 9 Beverages 8 Baby Food 7 Cereal 7 Vegetables 6 Snacks 3 Meat 2 Name: count, dtype: int64
data1.groupby(by=['Sales Channel'])[['Units Sold','Total Revenue','Total Profit']].sum()
| Units Sold | Total Revenue | Total Profit | |
|---|---|---|---|
| Sales Channel | |||
| Offline | 276782 | 79094808 | 24920727 |
| Online | 236089 | 58253958 | 19247472 |
D6=data1.groupby(by=['Sales Channel','Country'])[['Total Revenue','Total Profit']].sum().sort_values(by='Country',ascending=False)
D6
| Total Revenue | Total Profit | ||
|---|---|---|---|
| Sales Channel | Country | ||
| Online | Zambia | 623289 | 225247 |
| United Kingdom | 188452 | 46736 | |
| Offline | Tuvalu | 2533654 | 951411 |
| Online | Turkmenistan | 3262562 | 632513 |
| Offline | Turkmenistan | 2559474 | 634746 |
| ... | ... | ... | |
| Austria | 1244708 | 495008 | |
| Australia | 585795 | 207450 | |
| Online | Australia | 1904138 | 369155 |
| Offline | Angola | 2798046 | 693912 |
| Online | Albania | 247956 | 166635 |
87 rows × 2 columns
data.groupby(by=['Order Priority'])[['Units Sold','Total Revenue','Total Profit']].sum().sort_values(by='Total Profit',ascending=False)
| Units Sold | Total Revenue | Total Profit | |
|---|---|---|---|
| Order Priority | |||
| H | 154212 | 48749543 | 16891601 |
| L | 146876 | 36628129 | 10858727 |
| M | 94832 | 33116030 | 9669543 |
| C | 116951 | 18855064 | 6748328 |
data1.groupby(by=['Year'])[['Total Revenue','Total Profit']].sum()
| Total Revenue | Total Profit | |
|---|---|---|
| Year | ||
| 2010 | 19186023 | 6629568 |
| 2011 | 11129165 | 2741008 |
| 2012 | 31898644 | 9213009 |
| 2013 | 20330447 | 6715420 |
| 2014 | 16630217 | 5879462 |
| 2015 | 12427983 | 3996540 |
| 2016 | 12372866 | 4903838 |
| 2017 | 13373421 | 4089354 |
data1.groupby(by=['Month-Year']) ['Total Profit'].sum().head(15)
Month-Year Apr-11 693912 Apr-12 971008 Apr-13 632513 Apr-14 1838546 Apr-15 624230 Aug-12 248406 Aug-13 23150 Aug-14 306098 Aug-15 1622 Dec-10 641587 Dec-13 53253 Dec-16 1661390 Feb-10 1424411 Feb-11 127723 Feb-12 1553767 Name: Total Profit, dtype: int64
plt.figure(figsize=(5,3))
sns.scatterplot(x=data1['Units Sold'],y=data1['Total Profit'])
<Axes: xlabel='Units Sold', ylabel='Total Profit'>
plt.figure(figsize=(5,3))
sns.scatterplot(x=data1['Total Revenue'],y=data1['Units Sold'])
<Axes: xlabel='Total Revenue', ylabel='Units Sold'>
plt.figure(figsize=(5,3))
sns.scatterplot(x=data1['Total Cost'],y=data1['Total Profit'])
<Axes: xlabel='Total Cost', ylabel='Total Profit'>
da1=data1.describe()
sns.heatmap(np.round(da1.corr(),2),annot=True)
<Axes: >
#Region wise Total of Orders ID ------- Pending
da2=data1.groupby(by=['Region'])['Order ID'].count()
da2
Region Asia 11 Australia and Oceania 11 Central America and the Caribbean 7 Europe 22 Middle East and North Africa 10 North America 3 Sub-Saharan Africa 36 Name: Order ID, dtype: int64
sns.set(rc={'figure.figsize':(20,5)})
sns.histplot(data1['Order ID'],bins=30,kde=True)
<Axes: xlabel='Order ID', ylabel='Count'>
sns.histplot(data1['Total Cost'],bins=30,kde=True)
<Axes: xlabel='Total Cost', ylabel='Count'>
sns.histplot(data1['Total Profit'],bins=30,kde=True)
<Axes: xlabel='Total Profit', ylabel='Count'>
region= data1.groupby(by=['Region'])['Total Profit'].sum().sort_values(ascending=False).reset_index()
plt.figure(figsize=(20,5))
sns.barplot(x='Region',y='Total Profit',data =region,palette='viridis')
plt.yscale('log')
plt.title('Region wise Profit',fontsize=20,color='b')
plt.xlabel('Region',fontsize=10,color='r')
plt.ylabel('Total Profit',fontsize=10,color='r')
plt.show()
sold_Region=data.groupby(['Region','Item Type'],as_index=False)['Total Profit'].sum().sort_values(by='Total Profit',ascending=False)
#sns.set(rc={'figure.figsize':(15,5)})
DF=sns.barplot(x='Region',y='Total Profit',data=sold_Region)
for bars in DF.containers:
DF.bar_label(bars)
df=data1.groupby(by=['Country'])[['Total Revenue','Total Profit']].sum().head(20).sort_values(by='Total Profit',ascending=False)
sns.set(rc={'figure.figsize':(20,5)})
df.plot(kind='bar')
plt.title('Country Wise Total Revenue and Total Profit',fontsize=20,color='b')
plt.xlabel('Region',fontsize=15,color='r')
plt.ylabel('Total Profit',fontsize=15,color='r')
plt.show()
plt.figure(figsize=(20,5))
df1=data.groupby(by=['Order Priority'])['Total Profit'].sum()
fig=plt.figure(figsize=(3.5,4))
df1=data.groupby(by=['Order Priority']).size()
explode=(0,0.05,0.04,0)
colors=('g','c','y','b')
df1.plot(kind='pie',explode=explode,colors=colors,autopct='%1.2F%%',startangle=190)
plt.axis('equal')
plt.show()
<Figure size 2000x500 with 0 Axes>
Da5=data1.groupby(['Region','Sales Channel'],as_index=False)['Total Profit'].sum().sort_values(by='Total Profit')
sns.set(rc={'figure.figsize':(3,3)})
ax=sns.barplot(x='Sales Channel',y='Total Profit',data = Da5)
for bars in ax.containers:
ax.bar_label(bars)
da7=px.bar(data1,x='Item Type',y='Total Profit',color='Sales Channel',barmode='group',width=1000,height=500)
da7
D5=data1.groupby(by=['Region','Sales Channel'])[['Total Revenue','Total Profit']].sum().sort_values(by='Region',ascending=False)
D5.plot(kind="bar")
<Axes: xlabel='Region,Sales Channel'>
sns.set(rc={'figure.figsize':(20,5)})
Count=data['Item Type'].value_counts()
Count.plot(kind='bar')
plt.title('Total of Items Types',fontsize=20,color='b')
plt.xlabel('Items',fontsize=15,color='r')
plt.ylabel('Total',fontsize=15,color='r')
plt.show()
DF3=data.groupby(by=['Item Type'])[['Total Cost','Total Profit']].sum()
sns.set(rc={'figure.figsize':(20,5)})
DF3.plot(kind="bar")
plt.title('Items wise Total Cost and Total profit',fontsize=20,color='b')
plt.xlabel('Year',fontsize=15,color='r')
plt.ylabel('Total profit',fontsize=15,color='r')
plt.show()
df4=data.groupby(by=['Year'])['Total Profit'].sum().sort_values(ascending=False)
sns.set(rc={'figure.figsize':(20,5)})
df4.plot(kind='bar')
#ax=sns.barplot(x='Year',y='Total Profit',data = df4)
plt.title('Year wise Total Revenue and Total profit',fontsize=20,color='b')
plt.xlabel('Year',fontsize=15,color='r')
plt.ylabel('Total profit',fontsize=15,color='r')
plt.show()
df5=data.groupby(['Month-Year'])['Total Profit'].sum().sort_values(ascending=False)
sns.set(rc={'figure.figsize':(20,5)})
df5.plot(kind='bar')
#sns.barplot(x='MYear',y='Total Profit',data = Year,palette='viridis')
plt.title('Month-Year wise Total Revenue and Total profit',fontsize=20,color='b')
plt.xlabel('Year',fontsize=15,color='r')
plt.ylabel('Total profit',fontsize=15,color='r')
plt.show()
da8=px.bar(data1,x='Region',y='Total Profit',color='Item Type',barmode='group',width=1000,height=500)
da8
plt.figure(figsize=(7,5))
sns.scatterplot(x='Total Cost', y='Total Profit', data=data1, hue='Item Type');
df10=sns.boxplot(x=data1['Unit Price'],y=data1['Item Type'])
sns.set(rc={'figure.figsize':(20,5)})
sns.pairplot(data1)
<seaborn.axisgrid.PairGrid at 0x17a53c6e390>